Basic data analysis is done, including plotting sum and mean of item_cnt_day for each month to find some patterns, exploring missing values, inspecting test set etc
# Load all the given datasets
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
item_categories_df = pd.read_csv("./train_data/item_categories.csv")
items_df = pd.read_csv("./train_data/items.csv")
sales_train_df = pd.read_csv("./train_data/sales_train.csv")
shops_df = pd.read_csv("./train_data/shops.csv")
test_df = pd.read_csv("./test_data/test.csv")
test_df = pd.merge(test_df, items_df, on='item_id')
sub_df = pd.read_csv("./submission/sample_submission.csv")
# Creating a combined train df
df = pd.merge(sales_train_df, items_df, on='item_id')
del df['item_name']
df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')
df.head()
# Categories
import numpy as np
print ("\x1b[1;31m Total Shops in the given training dataset: \x1b[0m",len(np.unique(df.shop_id)))
print ("\x1b[1;31m Total Items in the given training dataset: \x1b[0m",len(np.unique(df.item_id)))
print ("\x1b[1;31m Total Categories in the given training dataset: \x1b[0m",len(np.unique(df.item_category_id)))
print ("\x1b[1;32m Timeline \x1b[0m")
print ("\x1b[1;31m Start date for the training dataset: \x1b[0m",np.min(df.date))
print ("\x1b[1;31m End date for the training dataset: \x1b[0m",np.max(df.date))
print ("\n\x1b[1;32m Item Price \x1b[0m")
print ("\x1b[1;31m Min item Price in the training dataset: \x1b[0m",np.min(df.item_price))
print ("\x1b[1;31m Max item Price in the training dataset: \x1b[0m",np.max(df.item_price))
print ("\n\x1b[1;32m Item Sold \x1b[0m")
print ("\x1b[1;31m Min item sold in the training dataset: \x1b[0m",np.min(df.item_cnt_day))
print ("\x1b[1;31m Max item sold in the training dataset: \x1b[0m",np.max(df.item_cnt_day))
Check for ouliers in item price column
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
SMALL_SIZE = 16
MEDIUM_SIZE = 10
BIGGER_SIZE = 1
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)
# Data
fig, axes = plt.subplots(figsize=(18, 10))
axes.set_title('Items Sold per Day',\
fontweight="bold",loc='center',fontsize=24)
axes.set_xlabel('Month',fontweight="bold",fontsize=16)
axes.set_ylabel('Item Counts',fontweight="bold",fontsize=16)
plt.rcParams.update({'font.size': 22})
# multiple line plot
plt.scatter( 'date', 'item_cnt_day', data=df, marker='o',color='red')
plt.plot( df.date , [500]*(len(df.item_cnt_day)),color='black')
plt.legend()
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
SMALL_SIZE = 16
MEDIUM_SIZE = 10
BIGGER_SIZE = 1
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)
# Data
fig, axes = plt.subplots(figsize=(18, 10))
axes.set_title('Price per Item',\
fontweight="bold",loc='center',fontsize=24)
axes.set_xlabel('Item',fontweight="bold",fontsize=16)
axes.set_ylabel('Item Counts',fontweight="bold",fontsize=16)
plt.rcParams.update({'font.size': 22})
# multiple line plot
plt.scatter( 'item_id', 'item_price', data=df, marker='o',color='green')
plt.plot( df.item_id , [43000]*(len(df.item_cnt_day)),color='black')
plt.legend()
From the above plots lets remove outliers from df.
Range for item price: > 0 and <=43000 Range for item count day: > 0 and <=500
train_df = df[(df.item_price > 0) & (df.item_price <=43000) & (df.item_cnt_day > 0) & (df.item_cnt_day <= 500)]
train_df.head()
train_df['day'] = train_df['date'].dt.day
train_df['month'] = train_df['date'].dt.month
train_df['year'] = train_df['date'].dt.year
train_df['total_price'] = train_df['item_price'] * train_df['item_cnt_day']
train_df.head(3)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
SMALL_SIZE = 16
MEDIUM_SIZE = 10
BIGGER_SIZE = 1
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)
# Data
fig, axes = plt.subplots(figsize=(18, 10))
axes.set_xticks([1,2,3,4,5,6,7,8,9,10,11,12])
axes.set_title('Monthly Items Sold per Year',\
fontweight="bold",loc='left',fontsize=24)
axes.set_xlabel('Month',fontweight="bold",fontsize=16)
axes.set_ylabel('Item Counts',fontweight="bold",fontsize=16)
plt.rcParams.update({'font.size': 22})
# multiple line plot
plt.plot( 'month', 'item_cnt_day_2013', data=df_2013_2014_2015, marker='o', markerfacecolor='blue', markersize=12, color='red', linewidth=4)
plt.plot( 'month', 'item_cnt_day_2014', data=df_2013_2014_2015, marker='o', markerfacecolor='blue', markersize=12, color='green', linewidth=4)
plt.plot( 'month', 'item_cnt_day_2015', data=df_2013_2014_2015, marker='o', markerfacecolor='blue', markersize=12, color='black', linewidth=4)
plt.legend()
fig = plt.figure(figsize=(18, 50))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
categories = test_df.item_category_id.unique()
fig.suptitle('Monthly Items Sold per Year per Category',\
fontweight="bold",fontsize=24)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
for i in range(1, 63):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.item_category_id==categories[i-1]]
ax.set_title("Category : "+str(categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['year','month']).agg({'item_cnt_day':'sum'}).reset_index()
df_2013 = temp_df[temp_df.year==2013]
df_2014 = temp_df[temp_df.year==2014]
df_2015 = temp_df[temp_df.year==2015]
if len(df_2013) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2013, marker='o', markerfacecolor='blue', markersize=2, color='red', linewidth=1)
if len(df_2014) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2014, marker='o', markerfacecolor='blue', markersize=2, color='green', linewidth=1)
if len(df_2015) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2015, marker='o', markerfacecolor='blue', markersize=2, color='black', linewidth=1)
#plt.legend()
fig.tight_layout()
fig.subplots_adjust(top=0.96)
fig = plt.figure(figsize=(18, 50))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 63):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.item_category_id==categories[i-1]]
ax.set_title("Category : "+str(categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['year']).agg({'item_cnt_day':'sum'}).reset_index()
df_2013 = temp_df[temp_df.year==2013]
df_2014 = temp_df[temp_df.year==2014]
df_2015 = temp_df[temp_df.year==2015]
if len(df_2013) > 0:
ax.bar( 'year', 'item_cnt_day', data=df_2013, color='red', linewidth=1,align='center')
if len(df_2014) > 0:
ax.bar( 'year', 'item_cnt_day', data=df_2014, color='green', linewidth=1,align='center')
if len(df_2015) > 0:
ax.bar( 'year', 'item_cnt_day', data=df_2015, color='black', linewidth=1,align='center')
fig.suptitle('Yearly Items Sold per Category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
From the first plot we couldnt really figure out how volume of goods sold is behaving with respect to each category. There is no clear trend each year
From the second plot, we can see the bigger picture year wise. There is trend of decline or increase per category yearly
fig = plt.figure(figsize=(18, 50))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 63):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.item_category_id==categories[i-1]]
ax.set_title("Category : "+str(categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
if len(temp_df) > 0:
z = np.polyfit(temp_df.date_block_num, temp_df.item_cnt_day, 1)
p = np.poly1d(z)
plt.plot(temp_df.date_block_num,p(temp_df.date_block_num),"b--")
plt.plot( 'date_block_num', 'item_cnt_day', data=temp_df, marker='o', markerfacecolor='blue', markersize=2, color='red', linewidth=1)
fig.suptitle('3 Year monthly goods sold trend per category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
fig = plt.figure(figsize=(25,25))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
dates = train_df.date_block_num.unique()
for i in range(1, 13):
ax = fig.add_subplot(4, 3, i)
temp_df_category = train_df[train_df.month==i]
ax.set_title("Month : "+str(i),fontweight="bold",loc='center',fontsize=24)
temp_df = temp_df_category.groupby(['item_category_id']).agg({'item_cnt_day':'sum'}).reset_index()
temp_df = temp_df.sort_values('item_cnt_day',ascending=False).head(15)
temp_df['item_category_id'] = temp_df['item_category_id'].astype(str)
if len(temp_df) > 0:
plt.bar( 'item_category_id', 'item_cnt_day', data=temp_df, linewidth=1)
#plt.legend()
fig.suptitle('Top 15 categories having maximum items sold per month',\
fontweight="bold",fontsize=34)
fig.subplots_adjust(top=0.93)
- Convert the russian category names and figure out if we can reduce the number of categories
from googletrans import Translator
translator = Translator()
def translate_to_english(x):
return translator.translate(x, src='ru').text
item_categories_df['english_item_category_name'] = item_categories_df.item_category_name.apply(lambda x: translate_to_english(x))
def higher_category(x):
if '-' in x:
return x.split('-')[0]
else:
return x
item_categories_df['higher_category'] = item_categories_df.english_item_category_name.apply(lambda x: higher_category(x))
item_categories_df = pd.read_csv("./train_data/english.csv")
item_categories_df = item_categories_df.drop(columns=['Unnamed: 0'])
item_categories_df.head()
higher_categories = item_categories_df.higher_category.unique()
print ("\x1b[1;32m Total Higher Categories Summary \x1b[0m")
print ("\x1b[1;31m We have inferred \x1b[0m",len(higher_categories), "\x1b[1;31m from a total of\x1b[0m",
len(item_categories_df.item_category_id.unique()),"\x1b[1;31m sub categories \x1b[0m \n")
print ("\x1b[1;32m The 19 categories are \x1b[0m")
for idx,category in enumerate(higher_categories):
print ("\x1b[1;31m Category ",idx+1," \x1b[0m",category)
train_df = pd.merge(train_df, item_categories_df, on='item_category_id').drop(columns = ['item_category_name'])
train_df.head()
fig = plt.figure(figsize=(18, 30))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 20):
ax = fig.add_subplot(7, 3, i)
temp_df_category = train_df[train_df.higher_category==higher_categories[i-1]]
ax.set_title("Category : "+str(higher_categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
if len(temp_df) > 0:
z = np.polyfit(temp_df.date_block_num, temp_df.item_cnt_day, 1)
p = np.poly1d(z)
plt.plot(temp_df.date_block_num,p(temp_df.date_block_num),"b--")
plt.plot( 'date_block_num', 'item_cnt_day', data=temp_df, marker='o',\
markerfacecolor='blue', markersize=2, color='red', linewidth=1)
fig.suptitle('3 Year monthly goods sold trend per higher category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.95)
The trendlines are more clear with the higher categories. Its also intresting to see some recurring peaks in the time line.
It would be intresting to figure out the reason for the peaks
fig = plt.figure(figsize=(18, 60))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 20):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.higher_category==higher_categories[i-1]]
ax.set_title("Category : "+str(higher_categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['year','month']).agg({'item_cnt_day':'sum'}).reset_index()
df_2013 = temp_df[temp_df.year==2013]
df_2014 = temp_df[temp_df.year==2014]
df_2015 = temp_df[temp_df.year==2015]
if len(df_2013) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2013, marker='o', markerfacecolor='blue', markersize=2, color='red', linewidth=1)
if len(df_2014) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2014, marker='o', markerfacecolor='blue', markersize=2, color='green', linewidth=1)
if len(df_2015) > 0:
plt.plot( 'month', 'item_cnt_day', data=df_2015, marker='o', markerfacecolor='blue', markersize=2, color='black', linewidth=1)
fig.suptitle('Volume of items sold for the last three years parallely',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
fig = plt.figure(figsize=(18, 60))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 60):
ax = fig.add_subplot(20, 3, i)
temp_df_category = train_df[train_df.shop_id==i-1]
ax.set_title("Shop : "+str(i-1),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['date_block_num']).agg({'total_price':'sum'}).reset_index()
if len(temp_df) > 0:
z = np.polyfit(temp_df.date_block_num, temp_df.total_price, 1)
p = np.poly1d(z)
plt.plot(temp_df.date_block_num,p(temp_df.date_block_num),"b--")
plt.plot( 'date_block_num', 'total_price', data=temp_df, marker='o', markerfacecolor='blue', markersize=2, color='green', linewidth=1)
#plt.legend()
fig.suptitle('Revenue Trend for each shop per month',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
Its intresting to see the revenue trend being very similar among most of the shops. Lets see it stacked up all together
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
SMALL_SIZE = 16
MEDIUM_SIZE = 10
BIGGER_SIZE = 1
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)
# Data
fig, axes = plt.subplots(figsize=(18, 8))
axes.set_xticks(list(range(34)))
axes.set_title('Revenue Trends for all shops for the last 3 years',\
fontweight="bold",loc='center',fontsize=24)
axes.set_xlabel('Month',fontweight="bold",fontsize=16)
axes.set_ylabel('Item Counts',fontweight="bold",fontsize=16)
plt.rcParams.update({'font.size': 22})
for i in range(1, 60):
temp_df_category = train_df[(train_df.shop_id==i-1) & (train_df.date_block_num<12)]
temp_df_2 = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
if len(temp_df_2) > 0:
plt.plot( 'date_block_num', 'item_cnt_day', data=temp_df_2, marker='o', markerfacecolor='blue', markersize=2, color='black', linewidth=1)
temp_df_category = train_df[(train_df.shop_id==i-1) & (train_df.date_block_num>=11) & (train_df.date_block_num<=23)]
temp_df_2 = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
if len(temp_df_2) > 0:
plt.plot( 'date_block_num', 'item_cnt_day', data=temp_df_2, marker='o', markerfacecolor='blue', markersize=2, color='red', linewidth=1)
temp_df_category = train_df[(train_df.shop_id==i-1) & (train_df.date_block_num>=23)]
temp_df_2 = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
if len(temp_df_2) > 0:
plt.plot( 'date_block_num', 'item_cnt_day', data=temp_df_2, marker='o', markerfacecolor='blue', markersize=2, color='green', linewidth=1)
#plt.legend()
train_df['day_name'] = train_df['date'].dt.day_name()
train_df['day_code'] = train_df['date'].apply(lambda x: x.weekday())
train_df['day_name_short'] = train_df['day_name'].apply(lambda x: x[:3])
train_df.head(3)
fig = plt.figure(figsize=(18, 60))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
for i in range(1, 18):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.higher_category==higher_categories[i-1]]
ax.set_title("Category : "+str(higher_categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['year','day_name_short','day_code']).agg({'item_cnt_day':'sum'}).reset_index()
temp_df = temp_df.sort_values('day_code',ascending=True)
plt.xticks(rotation=45)
df_2013 = temp_df[temp_df.year==2013]
df_2014 = temp_df[temp_df.year==2014]
df_2015 = temp_df[temp_df.year==2015]
if len(df_2013) > 0:
plt.plot( 'day_name_short', 'item_cnt_day', data=df_2013, marker='o', markerfacecolor='blue', markersize=2, color='red', linewidth=1)
if len(df_2014) > 0:
plt.plot( 'day_name_short', 'item_cnt_day', data=df_2014, marker='o', markerfacecolor='blue', markersize=2, color='green', linewidth=1)
if len(df_2015) > 0:
plt.plot( 'day_name_short', 'item_cnt_day', data=df_2015, marker='o', markerfacecolor='blue', markersize=2, color='black', linewidth=1)
#plt.legend()
fig.suptitle('Total number items sold in a week per category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
import matplotlib.cm as cm
fig = plt.figure(figsize=(18, 70))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
colors = cm.rainbow(np.linspace(0, 1, 18))
for i in range(1, 18):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.higher_category==higher_categories[i-1]]
ax.set_title("Category : "+str(higher_categories[i-1]),fontweight="bold",loc='center')
if len(temp_df_category) > 0:
plt.scatter( 'item_price', 'item_cnt_day', data=temp_df_category, marker='o', color=colors[i-1], linewidth=1)
fig.suptitle('Price distribution to goods sold per category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)
russian_holidays = pd.read_html('https://www.timeanddate.com/holidays/russia/2013')
russian_holidays_2013 = russian_holidays[0][~russian_holidays[0].isna().all(axis=1)]
russian_holidays_2013.columns = ["holiday_date","holiday_day","holiday_name","holiday_type"]
russian_holidays_2013['holiday_date'] = pd.to_datetime(russian_holidays_2013['holiday_date']+' 2013',format='%b %d %Y')
russian_holidays_2013.head()
russian_holidays = pd.read_html('https://www.timeanddate.com/holidays/russia/2014')
russian_holidays_2014 = russian_holidays[0][~russian_holidays[0].isna().all(axis=1)]
russian_holidays_2014.columns = ["holiday_date","holiday_day","holiday_name","holiday_type"]
russian_holidays_2014['holiday_date'] = pd.to_datetime(russian_holidays_2014['holiday_date']+' 2014',format='%b %d %Y')
russian_holidays = pd.read_html('https://www.timeanddate.com/holidays/russia/2015')
russian_holidays_2015 = russian_holidays[0][~russian_holidays[0].isna().all(axis=1)]
russian_holidays_2015.columns = ["holiday_date","holiday_day","holiday_name","holiday_type"]
russian_holidays_2015['holiday_date'] = pd.to_datetime(russian_holidays_2015['holiday_date']+' 2015',format='%b %d %Y')
russian_holidays = russian_holidays_2013.append(russian_holidays_2014)
russian_holidays = russian_holidays.append(russian_holidays_2015)
russian_holidays['is_holiday'] = 1
#russian_holidays.to_json("./train_data/russian_holidays.json",orient="records")
russian_holidays.head()
russian_holidays = russian_holidays.rename({'holiday_date':'date'},axis=1)
train_df = pd.merge(train_df, russian_holidays, on='date',how="left")
train_df['holiday_name'] = train_df['holiday_name'].fillna('No Holiday')
train_df['holiday_type'] = train_df['holiday_type'].fillna('No Holiday')
train_df['is_holiday'] = train_df['is_holiday'].fillna(0)
train_df = train_df.drop(columns=['holiday_day'])
train_df.head()
#updated_m_df['day_code'] = updated_m_df['date'].apply(lambda x: x.weekday())
#updated_m_df['day_name_short'] = updated_m_df['day_name'].apply(lambda x: x[:3])
#updated_m_df_holiday.to_json("./train_data/final_transform.json",orient="records")
arr_val = train_df['is_holiday'].values
adjusted_is_holiday = []
for idx, val in enumerate(arr_val):
if val == 1.0:
adjusted_is_holiday.append(1.0)
else:
if 1.0 in arr_val[idx:idx+7]:
adjusted_is_holiday.append(1.0)
else:
adjusted_is_holiday.append(0.0)
train_df['adjusted_is_holiday'] = adjusted_is_holiday
train_df.head(3)
train_df['week_number_of_year'] =train_df['date'].dt.week
def in_week(x):
if 1.0 in x.values:
return True
else:
return False
holiday_week_df = train_df.groupby(['year','week_number_of_year']).agg({'is_holiday':in_week}).reset_index()
holiday_week_df = holiday_week_df.rename({'is_holiday':'has_holiday_in_week'},axis=1)
train_df = pd.merge(train_df, holiday_week_df, on=['year','week_number_of_year'], how='left')
train_df.head(3)
fig = plt.figure(figsize=(18, 14))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
weekly_update = train_df.groupby(['year','week_number_of_year','has_holiday_in_week']).item_cnt_day.sum().reset_index()
years = [2013,2014,2015]
for i in range(1, 4):
ax = fig.add_subplot(3, 1, i)
temp_df_category = weekly_update[weekly_update.year==years[i-1]]
ax.set_title("Year : "+str(years[i-1]),fontweight="bold",loc='center')
temp_df_category_h_week = temp_df_category[temp_df_category.has_holiday_in_week]
temp_df_category_h_week_no = temp_df_category[~temp_df_category.has_holiday_in_week]
if len(temp_df_category_h_week) > 0:
plt.scatter( 'week_number_of_year', 'item_cnt_day', data=temp_df_category_h_week, marker='o', color='green', linewidth=1)
if len(temp_df_category_h_week_no) > 0:
plt.scatter( 'week_number_of_year', 'item_cnt_day', data=temp_df_category_h_week_no, marker='o',color='red', linewidth=1)
plt.plot( 'week_number_of_year', 'item_cnt_day', data=temp_df_category, color='black', linewidth=1)
#plt.legend()
fig.suptitle('Holiday impacted on volume of sale per year',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.93)
temp_df_category = train_df.groupby(['date','adjusted_is_holiday']).agg({'item_cnt_day':'sum'}).reset_index()
temp_df_category = temp_df_category.sort_values('date')
temp_df_category.head()
a = temp_df_category[temp_df_category.adjusted_is_holiday == 0.0]['item_cnt_day'].values
b = temp_df_category[temp_df_category.adjusted_is_holiday == 0.0]['date'].values
change_val_25 = [a[0]]
change_val_50 = [a[0]]
change_val_75 = [a[0]]
for i in range(1,len(a)):
val = a[0:i]
val_25 = np.percentile(val, 25)
val_50 = np.percentile(val, 50)
val_75 = np.percentile(val, 75)
change_val_25.append(val_25)
change_val_50.append(val_50)
change_val_75.append(val_75)
train_df_not_hol = temp_df_category[temp_df_category.adjusted_is_holiday == 0.0].assign(item_count_25TH=change_val_25,item_count_50TH=change_val_50,item_count_75TH=change_val_75)
a = temp_df_category[temp_df_category.adjusted_is_holiday == 1.0]['item_cnt_day'].values
b = temp_df_category[temp_df_category.adjusted_is_holiday == 1.0]['date'].values
change_val_25 = [a[0]]
change_val_50 = [a[0]]
change_val_75 = [a[0]]
for i in range(1,len(a)):
val = a[0:i]
val_25 = np.percentile(val, 25)
val_50 = np.percentile(val, 50)
val_75 = np.percentile(val, 75)
change_val_25.append(val_25)
change_val_50.append(val_50)
change_val_75.append(val_75)
train_df_hol = temp_df_category[temp_df_category.adjusted_is_holiday == 1.0].assign(item_count_25TH=change_val_25,item_count_50TH=change_val_50,item_count_75TH=change_val_75)
fig = plt.figure(figsize=(18, 15))
fig.subplots_adjust(hspace=0.3, wspace=0.4)
plot_values = ['item_count_25TH','item_count_50TH','item_count_75TH']
for i in range(1, 4):
ax = fig.add_subplot(3, 1, i)
ax.set_title("Holiday vs Non Holiday: "+plot_values[i-1],fontweight="bold",loc='center')
plt.scatter( 'date', plot_values[i-1], data=train_df_hol, c='black')
plt.scatter( 'date', plot_values[i-1], data=train_df_not_hol, c='green')
item_count_25th: As you can see the items that have less volume (i.e specialized items ) are sold more during holidays
item_count_75th: As you can see the items that have high volume (i.e more general items) are sold pretty much same on all days
fig = plt.figure(figsize=(18, 70))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 20):
ax = fig.add_subplot(16, 4, i)
temp_df_category = train_df[train_df.higher_category==higher_categories[i-1]]
ax.set_title("Category : "+str(higher_categories[i-1]),fontweight="bold",loc='center')
temp_df = temp_df_category.groupby(['date_block_num']).agg({'item_cnt_day':'sum'}).reset_index()
temp_df['differnetial_gs'] = temp_df.item_cnt_day.pct_change()
temp_df['differnetial_gs'] = temp_df['differnetial_gs'].fillna(0)
dF1 = temp_df[temp_df['differnetial_gs']>=0]
dF2 = temp_df[temp_df['differnetial_gs']<0]
if len(dF1) > 0:
plt.bar( 'date_block_num', 'differnetial_gs', data=dF1,color='green', linewidth=1)
if len(dF2) > 0:
plt.bar( 'date_block_num', 'differnetial_gs', data=dF2,color='red', linewidth=1)
if len(temp_df) > 0:
plt.plot( 'date_block_num', 'differnetial_gs', data=temp_df, marker='o', markerfacecolor='blue', markersize=2, color='black', linewidth=1)
#plt.legend()
fig.suptitle('Differential revenue - week by week per category',\
fontweight="bold",fontsize=24)
fig.tight_layout()
fig.subplots_adjust(top=0.96)